* Start from an empty session, allow up to 10000 variables, and disable output paging
clear all
set maxvar 10000
set more off

* set up the working directory in your computer
* NOTE: the path is left blank on purpose; add your own project folder after "cd"
cd 

* =============================================================
* GSS Data Setup
* =============================================================

* Load the GSS data file (looked up relative to the current working directory)
use "GSS7212_R2.DTA", clear


* =============================================================
* Identifying all children in the household roster
* =============================================================

** A roster member counts as R's child when relate1-14 equals 3
* NOTE relate11-14 : (visitor) relationship to household heads

* Per roster slot: child flag (missing when the relationship is missing),
* then age and sex copied only for the members flagged as children
forvalues k = 1/14 {
    gen x_child`k' = (relate`k' == 3) if !missing(relate`k')
    gen x_age`k' = old`k' if x_child`k' == 1
    gen x_gender`k' = gender`k' if x_child`k' == 1
}

* Age of the eldest and of the youngest child in the household
egen x_oldage = rowmax(x_age*)
egen x_youngage = rowmin(x_age*)
label var x_oldage "Age Eldest Child"
label var x_youngage "Age Youngest Child"

* Age spread between the eldest and the youngest child
gen x_diffage = x_oldage - x_youngage
label var x_diffage "Age Gap(Eldest-Youngest)"

* Respondent's age minus the eldest child's age
gen x_gengap = age - x_oldage
label var x_gengap "Generational Age Gap"

* One pass over the roster does two things for every child whose age equals the eldest age:
*   - copy that child's sex into x_oldsex (with ties, the last matching slot wins)
*   - set a marker so that ties can be counted afterwards
gen x_oldsex = .
forvalues k = 1/14 {
    replace x_oldsex = x_gender`k' if x_oldage == x_age`k' & !missing(x_oldage, x_age`k')
    gen x_oldsameage`k' = 1 if x_oldage == x_age`k' & !missing(x_oldage, x_age`k')
}

* Number of children sharing the eldest age (values above 1 indicate a tie)
egen old_sameage = rowtotal(x_oldsameage*)
label var old_sameage "N of same-age oldest in home"


* =====================================================================
* Major Independent Variables
* =====================================================================

* Counts of sons (child gender code 1) and daughters (child gender code 2) in the household
egen nx_son = anycount(x_gender*), values(1)
egen nx_daughter = anycount(x_gender*), values(2)
gen nx_child = nx_son + nx_daughter

* Share of sons and of daughters among the counted children (missing when nx_child is 0)
foreach kid in son daughter {
    gen px_`kid' = nx_`kid' / nx_child
}

* Dummies for the sex of the eldest child; left missing when that sex is unknown
gen fx_daughter = (x_oldsex == 2) if !missing(x_oldsex)
gen fx_son = (x_oldsex == 1) if !missing(x_oldsex)

* Variable labels
label var nx_son "N of Son"
label var nx_daughter "N of Daughter"
label var nx_child "N of Children"
label var px_son "% of Son"
label var px_daughter "% of Daughter"
label var fx_daughter "Oldest Daughter"
label var fx_son "Oldest Son"

* =====================================================================
* Dependent Variables
* =====================================================================
** Dep1 : partyID
* Recentre partyid codes 0..6 onto a -3..3 scale; code 7 is placed at the midpoint (0)
recode partyid (0=-3) (1=-2) (2=-1) (3=0) (4=1) (5=2) (6=3) (7=0), gen(d_repscale)

* Binary indicators. The (else=0) rule also turns missing codes into 0,
* so each indicator is reset to missing wherever partyid itself is missing.
recode partyid (5/6=1) (else=0), gen(d_rep)
replace d_rep = . if missing(partyid)
recode partyid (0/1=1) (else=0), gen(d_dem)
replace d_dem = . if missing(partyid)

label var d_repscale "Republican scale (-3 to 3)"
label var d_rep "Republican(=1)"
label var d_dem "Democrat(=1)"

* Dep2 : political ideology
* Centre polviews at 4 (giving -3..3), then rescale that range to -100..100
gen d_conscale = polviews - 4
gen d_conscale100 = d_conscale*100/3

* Binary indicators from the two ends of polviews (6/7 and 1/2), missing kept as missing
recode polviews (6/7=1) (else=0), gen(d_con)
replace d_con = . if missing(polviews)
recode polviews (1/2=1) (else=0), gen(d_lib)
replace d_lib = . if missing(polviews)

label var d_conscale "Conservative Scale(-3 to 3)"
label var d_con "Conservative(=1)"
* Label text corrected from the misspelled "Liveral(=1)"
label var d_lib "Liberal(=1)"

* =====================================================================
* Control Variables
* =====================================================================

* Respondent's age (age-cohort-period control)
gen c_age = age
label var c_age "Age"

* Female dummy (sex code 2), missing when sex is missing
gen c_female = (sex == 2) if !missing(sex)
label var c_female "Sex:Female"

* Race dummies: race codes 1, 2, 3 map to white, black, others in that order
local racecode 0
foreach grp in white black others {
    local ++racecode
    gen c_`grp' = (race == `racecode') if !missing(race)
}
label var c_white "Race:White"
label var c_black "Race:Black"
label var c_others "Race:others"

* Years of education; any missing code becomes system missing
gen c_educ = educ if !missing(educ)
label var c_educ "Years of Education"

* Respondent's socio-economic index; any missing code becomes system missing
gen c_sei = sei if !missing(sei)
label var c_sei "R's SES index"

* Number of all biological children reported by R
gen c_childs = childs
label var c_childs "N of all biological childs"

* Household size; any missing code becomes system missing
gen x_hsize = hompop if !missing(hompop)
label var x_hsize "Household Size"

* Born in the US dummy (born code 1), missing when born is missing
gen c_born = (born == 1) if !missing(born)
label var c_born "R born in US(=1)"

* =====================================================================
* in the 1994 GSS sample -- including information about all biological children
* =====================================================================

* Daughter dummy for each of the nine listed children, defined only when kdrel# == 1
* (presumably the code for a biological child - confirm against the codebook)
* and the child's sex is recorded
forvalues j = 1/9 {
    gen x_bdaughter`j' = (kdsex`j' == 2) if kdrel`j' == 1 & !missing(kdsex`j')
}

* fx_bdaughter takes the flag of the first listed child (lowest slot) that has a non-missing flag
gen fx_bdaughter = .
forvalues j = 1/9 {
    replace fx_bdaughter = x_bdaughter`j' if missing(fx_bdaughter)
}

* Number of daughters, number of children with a valid flag, and the daughter share
egen nx_bdaughter = rowtotal(x_bdaughter*)
egen nx_bchild = rownonmiss(x_bdaughter*)
gen px_bdaughter = nx_bdaughter / nx_bchild

* Age of each such child in 1994; birth years after 1994 (and missing ones) yield no age
forvalues j = 1/9 {
    gen x_boldest`j' = 1994 - kdyrbrn`j' if kdrel`j' == 1 & kdyrbrn`j' <= 1994
}

* Age of the eldest such child and R's age gap to that child
egen x_bageoldest = rowmax(x_boldest*)
gen x_bgengap = age - x_bageoldest

* Variable labels
label var nx_bdaughter "N bio daughters"
label var nx_bchild "N bio children"
label var fx_bdaughter "First bio = Daughter"
label var px_bdaughter  "% Daughters"
label var x_bgengap "Generation Age Gap"

* =====================================================================
* Sample Restriction
* =====================================================================

* out_sample records the FIRST exclusion rule a respondent fails.
* It starts at 0 (= not excluded so far); every rule below only touches rows still at 0,
* so the order of the replace statements matters. Rows left at 0 are recoded to 100 at the end.
gen out_sample = 0

** [samp] only consider R's relationship with household is head only (exclude spouse)
* NOTE: rows failing rplace == 1 (including missing rplace) receive code 1,
* i.e. they share the code whose label below reads "R has no children".
replace out_sample = 1 if ~(rplace == 1) & out_sample == 0 

** [samp] exclude following cases : R has no children
replace out_sample = 1 if out_sample == 0 & (nx_child == 0) // no cohab kid

** [samp] generational age gap
* A missing x_gengap does not satisfy "< 10", so such rows pass this rule.
replace out_sample = 2 if out_sample == 0 & x_gengap < 10

* Two or more children share the eldest age, so the eldest child's sex is ambiguous : 33 cases
replace out_sample = 3 if out_sample == 0 & old_sameage > 1

** [samp] other restrictions
* NOTE. Concern : Missing values in the age of children 
* NOTE. Concern : Missing values in the gender of children 
* x_mage / x_msex = share of roster children (n_child) with a recorded age / sex;
* a share below 1 means at least one child lacks that information.
egen n_child = rowtotal(x_child*)
egen x_mage = rownonmiss(x_age*)
replace x_mage = x_mage / n_child
egen x_msex = rownonmiss(x_gender*)
replace x_msex = x_msex / n_child
replace out_sample = 4 if out_sample == 0 & (x_mage < 1 | x_msex < 1)

* SAMPLE Sensitivity check
* 98: eldest co-resident child older than 16
* 99: household child count differs from childs (only when both are non-missing)
replace out_sample = 98 if out_sample == 0 & x_oldage > 16
replace out_sample = 99 if out_sample == 0 & (nx_child != childs) & ~missing(nx_child) & ~missing(childs)
recode out_sample 0=100


* Value labels for out_sample. Code 5 is labelled here but never assigned in this section.
#delimit ;
label define out_sample
1 "R has no children"
2 "Generational age gap < 10"
3 "R has two or more children who are in the same age"
4 "Missing in any child's age or gender"
5 "Missing in party ID" 
98 "R's oldest child is older than 16"
99 "# of bio kids != # of cohab-kids"
100 "Analytic Sample"
;
#delimit cr

label value out_sample out_sample

* Distribution of exclusion reasons
tab out_sample

* Variables carried into the GSS analytic file, grouped by role:
*   cv  = controls, dv = dependent variables, iv = independent variables, etc = design/id variables
local cv x_oldage x_gengap x_bgengap x_bageoldest childs x_hsize c_age c_female c_born c_educ c_sei
local dv d_conscale d_conscale100 d_repscale d_dem d_rep
local iv fx_daughter nx_daughter px_daughter nx_child nx_bdaughter nx_bchild px_bdaughter fx_bdaughter
local etc out_sample year id wtssall sampcode

* The keep list used to expand a local named cv2 as well. That local is never defined,
* so it expanded to nothing; the stale reference is dropped and the kept set is unchanged.
keep `cv' `dv' `iv' `etc'

* Tag the source and assign a running person id after sorting by year and id
gen dataset = "GSS"
sort year id
gen pid = _n // person id

compress
save gss_daughter_analytic, replace


* =============================================================
* ESS Data Setup
* =============================================================

* Combine six waves of ESS cross sectional data into one data 
* The block below is commented out: it is the one-off recipe that converts the wave files
* (rounds 1-5 from .xpt, round 6 from .dta) and appends them into essw1_w6.dta.
* Un-comment it only if essw1_w6.dta has to be rebuilt.
* NOTE(review): fdause is an older command name; recent Stata releases provide
* "import sasxport5" for the same purpose - confirm for your Stata version.
/*
clear

* set up the working directory in your computer
cd 

fdause ess1e06_3.xpt
compress
save ess1, replace

fdause ess2e03_3.xpt
compress
save ess2, replace

fdause ess3e03_4.xpt
compress
save ess3, replace

fdause ess4e04_1.xpt
compress
save ess4, replace

fdause ess5e03.xpt
compress
save ess5, replace

use ess6e01_2,clear
*do ess6e01_miss.do
compress
save ess6, replace

use ess1,clear
append using ess2
append using ess3
append using ess4
append using ess5
append using ess6
compress
save essw1_w6, replace

*/

* Load the combined six-wave ESS file (replaces the GSS data currently in memory)
use "essw1_w6.dta", clear

* =====================================================================
* Control Variables
* =====================================================================

* Interview year: start from inwyr, then fall back to inwyys and finally inwyye;
* the value 9999 is treated as "no year" in all three sources
gen year = inwyr if inwyr != 9999
foreach alt in inwyys inwyye {
    replace year = `alt' if year == . & `alt' != 9999
}

tab cntry if year == . // EE(Estonia) has all missing in survey years?

* Respondent's year of birth / age
/* NOTE about agea:
Calculation based on year of birth and year of interview.
Deviations:
Denmark: Lower age cut-off for Danish citizen sample was 16, for non-Danish citizens 18.
Italy, Spain, Switzerland: A few respondents are younger than the lower age cut-off.
Norway: Lower age cut off for the Norwegian sample was 16.
For further details please see item 46 in Country Reports in the Documentation Report
*/

* R's year of birth and age; values at or above the cut-offs (7000 / 900) are not used
gen c_yb = yrbrn if yrbrn < 7000
gen c_age = agea if agea < 900
* Where a (rounded) value of age is available and is not 999, it overrides agea
gen temp_age = round(age)
replace c_age = temp_age if !missing(temp_age) & age != 999

* Birth year + age approximates the interview year
gen c_gapyear = c_yb + c_age
tab c_gapyear year // okay I will use year instead of gapyear from now on
replace year = c_gapyear if essround == 5 & cntry == "EE" // recover EE's year from age information

* Years still missing are imputed from the ESS round: round r -> 2000 + 2r (2002, ..., 2012)
forvalues wave = 1/6 {
    replace year = 2000 + 2 * `wave' if year == . & essround == `wave'
}


* R's gender: code 9 and every missing code become system missing
gen c_gender = gndr if gndr != 9 & !missing(gndr)

* Female dummy, years of education (values of 76 and above dropped), born-in-country dummy
gen c_female = (c_gender == 2) if !missing(c_gender)
gen c_educ = eduyrs if eduyrs < 76
gen c_born = (brncntr == 1) if brncntr < 6


* =============================================================
* Identifying all children in the household roster
* =============================================================

* ESS round 1 keeps the roster relationship in rship#, all other rounds in rshipa#.
* Copy the valid round-1 codes (below 6) into rshipa# so that one variable serves every round.
forvalues m = 2/15 {
    replace rshipa`m' = rship`m' if essround == 1 & rship`m' < 6
}

* Child flag per household member (relationship code 2), defined only for codes below 6
forvalues m = 2/24 {
    gen x_child`m' = (rshipa`m' == 2) if rshipa`m' < 6
}
egen n_child = rowtotal(x_child*)
label var n_child "N child in home"

* For every child: sex indicators and year of birth.
* Sex codes above 6 and birth years above 6000 count as not reported, as do missing values.
forvalues m = 2/24 {
    gen x_sexmissing`m' = (missing(gndr`m') | gndr`m' > 6) if x_child`m' == 1
    gen x_daughter`m' = (gndr`m' == 2) if x_child`m' == 1 & x_sexmissing`m' == 0
    gen x_son`m' = (gndr`m' == 1) if x_child`m' == 1 & x_sexmissing`m' == 0
    gen x_agemissing`m' = (missing(yrbrn`m') | yrbrn`m' > 6000) if x_child`m' == 1
    gen x_by`m' = yrbrn`m' if x_child`m' == 1 & x_agemissing`m' == 0
}

* Eldest child in the household = earliest birth year; age is measured at the interview year
egen x_oldest_by = rowmin(x_by*)
gen x_oldage = year - x_oldest_by

* Generational age gap: R's age minus the eldest child's age
gen x_gengap = c_age - x_oldage

* Marker for every child born in the eldest child's birth year, then the count of such children
forvalues m = 2/24 {
    gen x_oldsameage`m' = 1 if x_oldest_by == x_by`m' & !missing(x_by`m', x_oldest_by)
}
egen old_sameage = rowtotal(x_oldsameage*)

* Number of daughters and sons in the household (children with a reported sex only)
egen nx_daughter = anycount(x_daughter*), values(1)
egen nx_son = anycount(x_son*), values(1)

gen nx_child = nx_daughter + nx_son

* Share of daughters
gen px_daughter = nx_daughter / nx_child

* Eldest child is a daughter: starts at 0 for respondents with at least one counted child,
* then takes the daughter flag of each child born in the eldest birth year (last match wins)
gen fx_daughter = 0 if nx_child != 0 & !missing(nx_child)
forvalues m = 2/24 {
    replace fx_daughter = x_daughter`m' if x_oldest_by == x_by`m' & !missing(x_by`m', x_oldest_by)
}

* Ever had children living in the household?
gen c_everchild = (chldhhe == 1) if chldhhe < 7

* Household size
gen x_hsize = hhmmb if hhmmb < 70 // # of people living regularly as a member of household


* Numeric country code
encode cntry, gen(cnum)

* R's socio-economic status based on Ganzeboom's code
* ESS conversion : occupation codes to socio-economic-indicator
* from ISCO to ISEI score
* http://www.harryganzeboom.nl/ESS-DEVO/index.htm
* The external do-file is expected to create the variable isei used on the next line.
do "ESS_Occupation_SEIscores.do"
gen c_sei = isei


* =====================================================================
* Dependent Variables
* =====================================================================

* Left-right placement: values of 77 and above are set to missing, then the scale is
* centred at 5 and stretched so that a distance of 5 points equals 100
recode lrscale (77/max=.) (missing=.)
gen d_conscale = lrscale - 5
gen d_conscale100 = 100 * d_conscale / 5

* DV for UK sample only : partyid -- prtclgb prtclagb
* conservative / labour / liberal democrats / scottish national party
* plaid cymru, green party, other parties, other answer, don't know/no answer
* Both source items get the same collapse: 8-22 -> 8, 66-99 -> 10
local slot 0
foreach src in prtclgb prtclagb {
    local ++slot
    recode `src' (1=1) (2=2) (3=3) (4=4) (5=5) (6=6) (7=7) (8/22=8) (66/99=10), gen(partyid`slot')
}

* Prefer the first item, fall back to the second when the first is system missing;
* respondents with clsprty == 2 are coded 9
gen partyid = cond(partyid1 == ., partyid2, partyid1)
replace partyid = 9 if clsprty == 2

tab partyid if cntry == "GB" & partyid < 9

* Two-group indicators: party 1 versus parties 2-3; every other code is missing
recode partyid (1=0) (2/3=1) (else=.), gen(d_dem)
recode partyid (1=1) (2/3=0) (else=.), gen(d_rep)

* Signed strength scale: 4 - prtdgcl for d_rep == 1, the negative of it for d_rep == 0;
* defined only when prtdgcl is below 6 and d_rep is 0 or 1
gen d_repscale = cond(d_rep == 1, 4 - prtdgcl, -(4 - prtdgcl)) if inlist(d_rep, 0, 1) & prtdgcl < 6


* =====================================================================
* SAMPLE restriction
* =====================================================================

* out_sample records the first exclusion rule a respondent fails.
* It starts at 0; rules 2 onwards only touch rows still at 0, so statement order matters.
* Rows left at 0 are recoded to 100 (analytic sample) at the end.
gen out_sample = 0
replace out_sample = 1 if nx_child == 0
replace out_sample = 2 if out_sample == 0 & x_gengap < 10
replace out_sample = 3 if out_sample == 0 & old_sameage > 1

* Share of roster children with a reported birth year (x_mage) and a reported sex (x_msex).
* The denominator is n_child, the number of roster members flagged as R's child.
* It used to be nx_child, which counts only children whose sex is reported: with that
* denominator x_msex could never fall below 1, and a child with missing sex (and age)
* went undetected. Dividing by n_child makes rule 4 match its label and the GSS section.
egen x_mage = rownonmiss(x_by*)
replace x_mage = x_mage / n_child
egen x_msex = rownonmiss(x_daughter*)
replace x_msex = x_msex / n_child
replace out_sample = 4 if out_sample == 0 & (x_mage < 1 | x_msex < 1)

* SAMPLE Sensitivity check
replace out_sample = 98 if out_sample == 0 & x_oldage > 16
recode out_sample 0=100

* Value labels; codes 5 and 99 are labelled but never assigned in this section.
#delimit ;
label define out_sample
1 "R has no children"
2 "Generational age gap < 10"
3 "R has two or more children who are in the same age"
4 "Missing in any child's age or gender"
5 "Missing in party ID" 
98 "R's oldest child is older than 16"
99 "# of bio kids != # of cohab-kids"
100 "Analytic Sample"
;
#delimit cr

label value out_sample out_sample

* Distribution of exclusion reasons
tab out_sample

* Respondent id under the same name as in the GSS file
gen id = idno

* Variables carried into the ESS analytic file, grouped by role:
*   cv  = controls, dv = dependent variables, iv = independent variables, etc = design/id variables
local cv x_oldage x_gengap x_hsize c_age c_female c_born c_sei c_educ
local dv d_conscale d_conscale100 d_rep d_dem d_repscale
local iv fx_daughter nx_daughter px_daughter nx_child 
local etc out_sample year id dweight pweight cntry cnum essround

* The keep list used to expand a local named cv2 as well. That local is never defined,
* so it expanded to nothing; the stale reference is dropped and the kept set is unchanged.
keep `cv' `dv' `iv' `etc'

* Tag the source and assign a running person id after sorting by round, country and id
gen dataset = "ESS"
sort essround cntry id
gen pid = _n

compress
save ess_daughter_analytic, replace

* =====================================================================
* Combining two data set
* =====================================================================

* Stack the two analytic files: GSS rows first, ESS rows appended below
use "gss_daughter_analytic.dta", clear
append using "ess_daughter_analytic.dta"

* Period indicator: survey year for GSS rows, ESS round for ESS rows
gen period = cond(dataset == "GSS", year, essround) if inlist(dataset, "GSS", "ESS")

* GSS rows get country "US" and the numeric country code 36
* NOTE(review): 36 is hard-coded; presumably it lies outside the codes produced by
* encode on the ESS countries - verify against the cnum value label
replace cntry = "US" if dataset == "GSS"
replace cnum = 36 if cntry == "US"

* Common weight: wtssall for GSS rows, dweight for ESS rows
gen wt = cond(dataset == "GSS", wtssall, dweight) if inlist(dataset, "GSS", "ESS")

label var nx_child "N of children"

* Save in the current format and additionally in an older Stata format
save daughter_analytic, replace

saveold daughter_analytic_bart, replace 
